pacman::p_load(sf, sfdep, tmap, tidyverse, RColorBrewer, ggplot2, spatstat, jsonlite, units, matrixStats)Take Home Exercise 3
1. Overview
1.1 Introduction
1.2 My Responsibilities
- Data Preparation, Preprocessing
1.3 Importing Packages
Here, we have loaded the following packages:
2. The Data
For this project, we will be using the following data sets.
Singapore Resale Flat Prices (Jan-17 to Sep-24) from Kaggle, an accumulation of information relating to the sale of Singapore’s public housing apartments colloquially referred to as flats
This dataset augments the original dataset by including 4 important categories of information:
X/Y lat/lng coordinates, which can be used for geospatial plotting.
Information about the closest MRT station to the flat
Information about the closest primary school to flat
the URA planning area (or town) of the flat.
Master Plan 2014 Subzone Boundary (Web) from data.gov.sg
HDB Property Info from data.gov.sg
Hawker Centres Dataset from data.gov.sg
Kindergarten, Childcare datasets from OneMap API
Bus Stops Location from LTA Data Mall
Shopping Mall Coordinates from Kaggle
2.1 Importing Geospatial Data
2.1.1 Importing Singapore Subzone Boundaries
The code chunk below is used to import MP_SUBZONE_WEB_PL shapefile by using st_read() of sfpackages.
mpsz_sf <- st_read(dsn = "data/geospatial/MasterPlan2014SubzoneBoundaryWebSHP/", layer = "MP14_SUBZONE_WEB_PL")Reading layer `MP14_SUBZONE_WEB_PL' from data source
`/Users/georgiaxng/georgiaxng/is415-handson/Take-home_Ex/Take-home_Ex03/data/geospatial/MasterPlan2014SubzoneBoundaryWebSHP'
using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21
write_rds(mpsz_sf, 'data/rds/mpsz_sf.rds')st_crs(mpsz_sf)Coordinate Reference System:
User input: SVY21
wkt:
PROJCRS["SVY21",
BASEGEOGCRS["SVY21[WGS84]",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]],
ID["EPSG",6326]],
PRIMEM["Greenwich",0,
ANGLEUNIT["Degree",0.0174532925199433]]],
CONVERSION["unnamed",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["Degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["(E)",east,
ORDER[1],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]],
AXIS["(N)",north,
ORDER[2],
LENGTHUNIT["metre",1,
ID["EPSG",9001]]]]
2.1.1.1 Checking Validity of Geometries
Using st_is_valid, we can check to see whether all the polygons are valid or not. From the results, we can see a total of 9 not valid.
# checks for the number of geometries that are invalid
length(which(st_is_valid(mpsz_sf) == FALSE))[1] 9
To rectify this, we can use st_make_valid() to correct these invalid geometries as demonstrated in the code chunk below.
mpsz_sf <- st_make_valid(mpsz_sf)
length(which(st_is_valid(mpsz_sf) == FALSE))[1] 0
Here is a plot of Singapore.
plot(mpsz_sf)2.1.2 Importing Kindergartens
Click to view the code
kindergarten_json <- fromJSON("data/geospatial/kindergartens.json")
kindergarten_cleaned <- kindergarten_json$SrchResults[-1, ]
kindergarten_df <- data.frame(
NAME = kindergarten_cleaned$NAME,
latitude = sapply(kindergarten_cleaned$LatLng, function(x) as.numeric(unlist(strsplit(x, ","))[1])),
longitude = sapply(kindergarten_cleaned$LatLng, function(x) as.numeric(unlist(strsplit(x, ","))[2]))
)
kindergarten_sf <- kindergarten_df %>%
st_as_sf(coords = c("longitude", "latitude"), crs=4326) %>%
st_transform(crs = 3414)tmap_mode('plot')
tm_shape(mpsz_sf) +
tm_polygons()+
tm_shape(kindergarten_sf) +
tm_dots()2.1.3 Importing Childcare
Click to view the code
childcare_json <- fromJSON("data/geospatial/childcare.json")
childcare_cleaned <- childcare_json$SrchResults[-1, ]
childcare_df <- data.frame(
NAME = childcare_cleaned$NAME,
latitude = sapply(childcare_cleaned$LatLng, function(x) as.numeric(unlist(strsplit(x, ","))[1])),
longitude = sapply(childcare_cleaned$LatLng, function(x) as.numeric(unlist(strsplit(x, ","))[2]))
)
childcare_sf <- childcare_df %>%
st_as_sf(coords = c("longitude", "latitude"), crs=4326) %>%
st_transform(crs = 3414)tmap_mode('plot')
tm_shape(mpsz_sf) +
tm_polygons()+
tm_shape(childcare_sf) +
tm_dots()2.1.4 Importing Hawker Centre
hawker_sf <- st_read('data/geospatial/HawkerCentresGEOJSON.geojson')Reading layer `HawkerCentresGEOJSON' from data source
`/Users/georgiaxng/georgiaxng/is415-handson/Take-home_Ex/Take-home_Ex03/data/geospatial/HawkerCentresGEOJSON.geojson'
using driver `GeoJSON'
Simple feature collection with 125 features and 2 fields
Geometry type: POINT
Dimension: XYZ
Bounding box: xmin: 103.6974 ymin: 1.272716 xmax: 103.9882 ymax: 1.449017
z_range: zmin: 0 zmax: 0
Geodetic CRS: WGS 84
# Function to extract name from description
extract_name <- function(description) {
if (!is.na(description)) {
# Use regular expression to extract the NAME
name <- sub(".*<th>NAME</th> <td>(.*?)</td>.*", "\\1", description)
if (name == description) {
return(NA) # Return NA if no match is found
}
return(name)
} else {
return(NA)
}
}
# Apply the extraction function to every row
hawker_sf <- hawker_sf %>%
mutate(Name = sapply(Description, extract_name)) %>% select (-Description)
# Verify the results
glimpse(hawker_sf)Rows: 125
Columns: 2
$ Name <chr> "Market Street Hawker Centre", "Marsiling Mall Hawker Centre"…
$ geometry <POINT [°]> POINT Z (103.8502 1.284425 0), POINT Z (103.7798 1.4335…
As shown above, we can see that the geographic coordinate system for the hawker dataset is in WGS84 and has XYZ coordinates, among which contains the Z-coordinates we do not need. Thus, we can use st_zm() to remove the Z-coordinate and project it to the SVY21 coordiate system using st_transform().
hawker_sf <- st_zm(hawker_sf) %>%
st_transform(crs = 3414)
head(hawker_sf)Simple feature collection with 6 features and 1 field
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 22042.51 ymin: 29650.7 xmax: 35955.52 ymax: 47850.43
Projected CRS: SVY21 / Singapore TM
Name geometry
1 Market Street Hawker Centre POINT (29874.82 29650.7)
2 Marsiling Mall Hawker Centre POINT (22042.51 46139.03)
3 Margaret Drive Hawker Centre POINT (24816.7 31094.91)
4 Fernvale Hawker Centre & Market POINT (32867.9 41500.77)
5 One Punggol Hawker Centre POINT (35955.52 43336.13)
6 Bukit Canberra Hawker Centre POINT (26794.39 47850.43)
tmap_mode('plot')
tm_shape(mpsz_sf) +
tm_polygons()+
tm_shape(hawker_sf) +
tm_dots()2.1.5 Importing Bus Stops
busstop_sf <- st_read(dsn = "data/geospatial/BusStopLocation_Jul2024/", layer = "BusStop")%>%
st_transform(crs = 3414)Reading layer `BusStop' from data source
`/Users/georgiaxng/georgiaxng/is415-handson/Take-home_Ex/Take-home_Ex03/data/geospatial/BusStopLocation_Jul2024'
using driver `ESRI Shapefile'
Simple feature collection with 5166 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 3970.122 ymin: 26482.1 xmax: 48285.52 ymax: 52983.82
Projected CRS: SVY21
2.1.6 Importing Shopping Malls
shoppingmall_sf <- read_csv('data/geospatial/shopping_mall_coordinates.csv') %>%
st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs=4326) %>%
st_transform(crs = 3414)2.2 Importing Aspatial Data
2.2.1 Importing Resale Flat Prices
The code chunk below is used to import the Resale Flat Prices dataset from Kaggle.
resale_df = read_csv('data/aspatial/resale_hdb_price_for_kaggle_2024-30sep.csv')To get a brief overview of existing columns of this dataset, we can use colnames() to do so.
colnames(resale_df) [1] "...1" "month"
[3] "storey_range" "floor_area_sqm"
[5] "flat_model" "lease_commence_date"
[7] "remaining_lease" "resale_price"
[9] "floor_area_sqft" "price_per_sqft"
[11] "blk_no" "road_name"
[13] "building" "postal"
[15] "address" "lease_commence_date_r"
[17] "planning_area_ura" "region_ura"
[19] "x" "y"
[21] "latitude" "longitude"
[23] "closest_mrt_station" "distance_to_mrt_meters"
[25] "transport_type" "line_color"
[27] "distance_to_cbd" "closest_pri_school"
[29] "distance_to_pri_school_meters"
2.2.1.1 CRS Adjustments
Another important step after importing the dataset is checking the coordinate system used, as seen in the result below using st_crs(), we can see that there is no CRS.
st_crs(resale_df)Coordinate Reference System: NA
Therefore, we need to convert the longitude and latitude columns into a spatial format. Since our dataset is based in Singapore and it uses the SVY21 coordinate reference system (CRS Code: 3414), we will use the st_transform() function to perform the conversion and create the geometry column.
resale_sf <- resale_df %>%
st_as_sf(coords = c("longitude", "latitude"), crs=4326) %>%
st_transform(crs = 3414)Using st_crs(), we can check and verify that the conversion is successful.
st_crs(resale_sf)Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
head(resale_df)# A tibble: 6 × 29
...1 month storey_range floor_area_sqm flat_model lease_commence_date
<dbl> <date> <chr> <dbl> <chr> <date>
1 0 2017-01-01 10 TO 12 44 Improved 1979-01-01
2 1 2017-01-01 01 TO 03 67 New Generati… 1978-01-01
3 2 2017-01-01 01 TO 03 67 New Generati… 1980-01-01
4 3 2017-01-01 04 TO 06 68 New Generati… 1980-01-01
5 4 2017-01-01 01 TO 03 67 New Generati… 1980-01-01
6 5 2017-01-01 01 TO 03 68 New Generati… 1981-01-01
# ℹ 23 more variables: remaining_lease <chr>, resale_price <dbl>,
# floor_area_sqft <dbl>, price_per_sqft <dbl>, blk_no <chr>, road_name <chr>,
# building <chr>, postal <chr>, address <chr>, lease_commence_date_r <date>,
# planning_area_ura <chr>, region_ura <chr>, x <dbl>, y <dbl>,
# latitude <dbl>, longitude <dbl>, closest_mrt_station <chr>,
# distance_to_mrt_meters <dbl>, transport_type <chr>, line_color <chr>,
# distance_to_cbd <dbl>, closest_pri_school <chr>, …
2.2.1.2 Checking for NA values
This chunk of code checks the dataset for any na values in all of the columns. As shown below, there is none.
resale_sf %>%
summarise(across(everything(), ~ sum(is.na(.)))) -> extra_NA
extra_NASimple feature collection with 1 feature and 27 fields
Geometry type: MULTIPOINT
Dimension: XY
Bounding box: xmin: 11519.15 ymin: 28097.64 xmax: 45192.3 ymax: 48741.06
Projected CRS: SVY21 / Singapore TM
# A tibble: 1 × 28
...1 month storey_range floor_area_sqm flat_model lease_commence_date
<int> <int> <int> <int> <int> <int>
1 0 0 0 0 0 0
# ℹ 22 more variables: remaining_lease <int>, resale_price <int>,
# floor_area_sqft <int>, price_per_sqft <int>, blk_no <int>, road_name <int>,
# building <int>, postal <int>, address <int>, lease_commence_date_r <int>,
# planning_area_ura <int>, region_ura <int>, x <int>, y <int>,
# closest_mrt_station <int>, distance_to_mrt_meters <int>,
# transport_type <int>, line_color <int>, distance_to_cbd <int>,
# closest_pri_school <int>, distance_to_pri_school_meters <int>, …
2.2.1.3 Time Period Selection
resale_sf <- resale_sf %>% filter (year(month) >= 2020)2.2.2 Importing HDB Property Information
hdb_info_df <- read_csv('data/aspatial/HDB Property Information.csv')3. Data Wrangling
3.1 Adding Housing Type for Flats
To assess whether the housing type influences the resale price of flats, we need to include this variable in our analysis. However, the current dataset lacks this information and thus we will be deriving it using other relevant variables. Specifically, we can use the floor area of the flats to determine the housing type, based on the guidelines provided on data.gov.sg.
2 Rm (2 room HDB Flat). 1 bedroom with a built-in area of about 45 sq m or 485 sq ft.
3 Rm (3 room HDB Flat). 2 bedrooms with a built-in area of about 70 sq m or 754 sq ft.
4 Rm (4 room HDB Flat). 3 bedrooms with a built-in area of about 90 sq m or 969 sq ft.
5 Rm (5 room HDB Flat). 3 bedrooms with a built-in area of about 110 sq m or 1,184 sq ft.
EA (Executive Apartment). 3/4 bedrooms with built-in area of about 150 sqm or 1,615 sqft.
EM (Executive Mansionette). Same as Executive apartment, except it has two levels.
6 Rm (6 room HDB Flat). Jumbo flat joint by two 3 room flats
Particularly, we will just group EA, EM and 6 rm flats into one category since there is no way for us right now to differentiate just by the size of the flats.
This chunk of code below derives the housing type for each flat by using conditional statements to check if the stated floor area is closer to that of a particular housing type and add the value in the new column created using mutate().
resale_sf <- resale_sf %>%
mutate(housing_type = case_when(
floor_area_sqft <= 619 ~ 2, # Closer to 485 sqft
floor_area_sqft > 619 & floor_area_sqft <= 862 ~ 3, # Closer to 754 sqft
floor_area_sqft > 862 & floor_area_sqft <= 1076 ~ 4, # Closer to 969 sqft
floor_area_sqft > 1076 & floor_area_sqft <= 1399 ~ 5, # Closer to 1184 sqft
floor_area_sqft > 1399 ~ 6, # Closer to 1615 sqft, EA/EM/6 Rm
))3.2 Remaining Lease
3.2.1 Grouping Remaining Lease By Range
First, let us convert the lease period from chr to total months. Below, we extract the different year and month from the string and then make the calculations to convert it to total months.
# Function to convert lease period to total months
convert_to_months <- function(lease) {
parts <- strsplit(lease, " ")[[1]] # Split by space
years <- as.numeric(parts[1]) # Extract years
# Check if months are present
if (length(parts) > 2) {
months <- as.numeric(parts[3]) # Extract months if present
} else {
months <- 0 # Set months to 0 if not present
}
total_months <- years * 12 + months # Convert to total months
return(total_months)
}
resale_sf <- resale_sf %>%
mutate(remaining_lease_total_months = sapply(remaining_lease, convert_to_months))Using the number of total months, we can then get a brief overview of the remaining lease of the resale flat including the mins and max. In this case, we can see that the minimum number of months is 497 (ard 41 years) and maximum is 1173 (around 97 years). This information will later be useful in setting the ranges for the remaining lease.
summary(resale_sf['remaining_lease_total_months']) remaining_lease_total_months geometry
Min. : 497.0 POINT :126468
1st Qu.: 746.0 epsg:3414 : 0
Median : 893.0 +proj=tmer...: 0
Mean : 894.3
3rd Qu.:1090.0
Max. :1173.0
Since the minimum is 41 years, our date range can start from 40-49 years onwards.
E.g. 40-49 years refers to 40 years 0 months to 49 years 11 months.
# Define the ranges and labels
breaks <- c(480, 600, 720, 840, 960, 1080, 1200)
labels <- c( "40-49 years", "50-59 years", "60-69 years", "70-79 years", "80-89 years", "90-99 years")
resale_sf <- resale_sf %>%
mutate(remaining_lease_range = cut(remaining_lease_total_months, breaks = breaks, labels = labels))3.2.2 Converting Remaining Lease to Year
# Function to convert lease period to total years
convert_to_years <- function(lease) {
parts <- strsplit(lease, " ")[[1]]
years <- as.numeric(parts[1])
# Check if months are present
if (length(parts) > 2) {
months <- as.numeric(parts[3])
} else {
months <- 0
}
total_years <- years + (months / 12) # Convert to total years
return(total_years)
}
# Apply the updated function
resale_sf <- resale_sf %>%
mutate(remaining_lease_years = sapply(remaining_lease, convert_to_years))3.3 Removal of Redundant Columns
# Define columns to be removed
columns_to_remove <- c("floor_area_sqm", "flat_model", "lease_commence_date", "remaining_lease", "blk_no", "road_name", "building", "address", "lease_commence_date_r", "postal","...1", "x", "y", "transport_type", "line_color")
# Remove columns only if they exist in the dataframe
resale_sf <- resale_sf %>%
dplyr::select(-all_of(columns_to_remove[columns_to_remove %in% names(resale_sf)]))3.4 Storey Ranges
3.4.1 Grouping of High Storeys
Through the histogram of the storey ranges, we can observe a right skewed distribution which shows that higher storeys are of a much lower frequency among resale flats. This is likely due to the fact that most HDB flats in Singapore are lower-rise buildings.
# Create a summary of counts for each remaining lease range
count_data <- resale_sf %>%
group_by(storey_range) %>%
summarise(count = n())
# Create the bar plot with labels
ggplot(count_data, aes(x = storey_range, y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5, size = 4) + # Add labels on top of the bars
labs(title = "Count of Storey Range",
x = "Storey Range",
y = "Count") +
theme_minimal()Here is a summary of the maximum floor levels of all the HDB flats in Singapore. As shown here we can observe that both the mean and median maximum floor is a mere 12.
summary(hdb_info_df$max_floor_lvl) Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 6.0 12.0 12.1 16.0 50.0
Thus, to simplify our analysis and make the visualization easier to interpret, we can combine the higher storey ranges into a single broader range. In particular, we will group the floors 16 or higher together to form the 16+ floors range. As seen below, the results are 6 categories of storey ranges.
Note: storey_range_grouped is duplicated twice so that one column can be used for the subsequent dummy coding of the storey ranges which will later be replaced.
resale_sf <- resale_sf %>%
mutate(storey_range_grouped = case_when(
storey_range %in% c("01 TO 03", "04 TO 06", "07 TO 09", "10 TO 12", "13 TO 15") ~ storey_range,
storey_range %in% c("16 TO 18", "19 TO 21", "22 TO 24", "25 TO 27", "28 TO 30", "31 TO 33", "34 TO 36", "37 TO 39", "40 TO 42", "43 TO 45", "46 TO 48", "49 TO 51") ~ "16+",
TRUE ~ storey_range
)) %>%
mutate(storey_range_grouped_dummy = case_when(
storey_range %in% c("01 TO 03", "04 TO 06", "07 TO 09", "10 TO 12", "13 TO 15") ~ storey_range,
storey_range %in% c("16 TO 18", "19 TO 21", "22 TO 24", "25 TO 27", "28 TO 30", "31 TO 33", "34 TO 36", "37 TO 39", "40 TO 42", "43 TO 45", "46 TO 48", "49 TO 51") ~ "16+",
TRUE ~ storey_range
))
# Plot the grouped data
ggplot(resale_sf, aes(x = storey_range_grouped)) +
geom_bar() +
geom_text(stat='count', aes(label=..count..), vjust=-0.5) +
labs(title = "Count of Storey Range (Grouped)",
x = "Storey Range",
y = "Count") +
theme_minimal()3.4.2 Dummy Coding of Storey Ranges
To convert categorical variables into a numerical format that can be easily used in regression models, we can perform dummy coding which is basically statistical technique involving creating binary (0 or 1) variables for each category of a categorical variable, allowing the model to interpret and analyze categorical data.
For e.g. if a flat falls within the “01 TO 03” storey range, the corresponding dummy variable is coded as 1, and all others are coded as 0 ([1,0,0,0,0,0])
resale_sf <- resale_sf %>%
pivot_wider(
names_from = storey_range_grouped_dummy, values_from = storey_range_grouped_dummy,
values_fn = list(storey_range_grouped_dummy = ~1), values_fill = 0,
) 3.5 Calculate Number of Amenities Within 1km & Proximity To Nearest Amenity
calculate_amenities_and_proximity <- function(dataset1, dataset2, name_of_col_amenities, name_of_col_proximity, radius) {
# Calculate distance matrix
dist_matrix <- st_distance(dataset1, dataset2) %>%
drop_units()
# Calculate the number of amenities within the specified radius
dataset1[[name_of_col_amenities]] <- rowSums(dist_matrix <= radius)
# Calculate the proximity to the nearest amenity
dataset1[[name_of_col_proximity]] <- rowMins(dist_matrix)
return(dataset1)
}resale_sf <-
calculate_amenities_and_proximity(
resale_sf, kindergarten_sf, "no_of_kindergarten_500m", "prox_kindergarten", 500
) %>%
calculate_amenities_and_proximity(
., childcare_sf, "no_of_childcare_500m", "prox_childcare", 500
) %>%
calculate_amenities_and_proximity(
., hawker_sf, "no_of_hawker_500m", "prox_hawker", 500
) %>%
calculate_amenities_and_proximity(
., busstop_sf, "no_of_busstop_500m", "prox_busstop", 500
) %>%
calculate_amenities_and_proximity(
., shoppingmall_sf, "no_of_shoppingmall_1km", "prox_shoppingmall", 1000
)
# Writing to RDS
write_rds(resale_sf,'data/rds/resale_sf.rds')4. Overview Of Dataset
resale_sf <- read_rds('data/rds/resale_sf.rds')colnames(resale_sf) [1] "month" "storey_range"
[3] "resale_price" "floor_area_sqft"
[5] "price_per_sqft" "planning_area_ura"
[7] "region_ura" "closest_mrt_station"
[9] "distance_to_mrt_meters" "distance_to_cbd"
[11] "closest_pri_school" "distance_to_pri_school_meters"
[13] "geometry" "housing_type"
[15] "remaining_lease_total_months" "remaining_lease_range"
[17] "remaining_lease_years" "storey_range_grouped"
[19] "04 TO 06" "16+"
[21] "01 TO 03" "07 TO 09"
[23] "10 TO 12" "13 TO 15"
[25] "no_of_kindergarten_500m" "prox_kindergarten"
[27] "no_of_childcare_500m" "prox_childcare"
[29] "no_of_hawker_500m" "prox_hawker"
[31] "no_of_busstop_500m" "prox_busstop"
[33] "no_of_shoppingmall_1km" "prox_shoppingmall"
Explanatory Variables:
Continuous
Remaining Lease:
remaining_lease_total_monthsSize of flat:
floor_area_sqftDistance to transport:
distance_to_mrt_metersDistance to amenities:
distance_to_pri_school_metersDistance to central business district:
distance_to_cbdHousing Type:
housing_type
Categorical
Remaining Lease:
remaining_lease_rangeStorey Height:
storey_range
Dependent Variables:
- Resale Price:
resale_price,price_per_sqft
3. Shiny Storyboard (EDA)
3.1 Distribution
3.2
4. Distribution
4.1 Categorical Variables
4.1.1 Remaining Lease Range
# Create a summary of counts for each remaining lease range
count_data <- resale_sf %>%
group_by(remaining_lease_range) %>%
summarise(count = n())
# Create the bar plot with labels
ggplot(count_data, aes(x = remaining_lease_range, y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5, size = 4) + # Add labels on top of the bars
labs(title = "Count of Remaining Lease Ranges",
x = "Remaining Lease Range",
y = "Count") +
theme_minimal()4.1.2 Storey Range
# Create a summary of counts for each remaining lease range
count_data <- resale_sf %>%
group_by(storey_range_grouped) %>%
summarise(count = n())
# Create the bar plot with labels
ggplot(count_data, aes(x = storey_range_grouped, y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5, size = 4) + # Add labels on top of the bars
labs(title = "Count of Storey Range",
x = "Storey Range",
y = "Count") +
theme_minimal()4.1.3 Housing Type
# Create a summary of counts for each remaining lease range
count_data <- resale_sf %>%
group_by(housing_type) %>%
summarise(count = n())
# Create the bar plot with labels
ggplot(count_data, aes(x = housing_type, y = count)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = count), vjust = -0.5, size = 4) + # Add labels on top of the bars
labs(title = "Count of Housing Type",
x = "Housing Type",
y = "Count") +
theme_minimal()Bivariate Analysis
Correlation Matrix
Drafts
lease_storey_table <- table(resale_sf$storey_range_grouped, resale_sf$remaining_lease_range)
lease_storey_df <- as.data.frame(lease_storey_table)
# Heatmap with ggplot
ggplot(lease_storey_df, aes(Var1, Var2, fill = Freq)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red") +
labs(title = "Heatmap: Storey Range vs. Remaining Lease Range",
x = "Storey Range",
y = "Remaining Lease Range") +
theme_minimal()mpsz_sf_main <- st_union(mpsz_sf) %>%
st_cast("POLYGON")
mpsz_sf_main <- mpsz_sf_main[c(12)]
mpsz_sf_owin <- as.owin(mpsz_sf_main)plot(mpsz_sf_owin)resale_2020_sf <- resale_sf %>% filter (year(month) == 2020)
resale_2021_sf <- resale_sf %>% filter (year(month) == 2021)
resale_2022_sf <- resale_sf %>% filter (year(month) == 2022)
resale_2023_sf <- resale_sf %>% filter (year(month) == 2023)
resale_2024_sf <- resale_sf %>% filter (year(month) == 2024)tmap_mode('plot')
tm_shape(mpsz_sf%>% filter(PLN_AREA_N == 'ANG MO KIO'))+
tm_polygons()+
tm_shape(resale_2023_sf %>% filter(planning_area_ura == 'ANG MO KIO'))+
tm_dots()tmap_mode('plot')
tm_shape(mpsz_sf%>% filter(PLN_AREA_N == 'ANG MO KIO'))+
tm_polygons()+
tm_shape(resale_2024_sf %>% filter(planning_area_ura == 'ANG MO KIO'))+
tm_dots()test <- resale_sf %>% filter(housing_type == 'EA/EM/6 Rm')
testSimple feature collection with 0 features and 33 fields
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Projected CRS: SVY21 / Singapore TM
# A tibble: 0 × 34
# ℹ 34 variables: month <date>, storey_range <chr>, resale_price <dbl>,
# floor_area_sqft <dbl>, price_per_sqft <dbl>, planning_area_ura <chr>,
# region_ura <chr>, closest_mrt_station <chr>, distance_to_mrt_meters <dbl>,
# distance_to_cbd <dbl>, closest_pri_school <chr>,
# distance_to_pri_school_meters <dbl>, geometry <GEOMETRY [m]>,
# housing_type <dbl>, remaining_lease_total_months <dbl>,
# remaining_lease_range <fct>, remaining_lease_years <dbl>, …